The data were generated by the Duke University MS Core. The data were delivered as quality controled excel files. Data were modified to follow typical tabular format for easy loading into R.
ms.data <- read.delim("data/preprocessed.csv", sep = ",", check.names = F)
head(ms.data[, 1:10])
## Sample Timepoint Group Sex Age Treatment Infection VL C0 C2
## 1 M625 -21 IL21 F 674 None Naive 0 25.1 4.83
## 2 M625 7 IL21 F 674 None SIV 5369 33.7 12.90
## 3 M625 70 IL21 F 674 None SIV 39039 28.4 6.78
## 4 M625 91 IL21 F 674 IL21 SIV 50530 35.0 9.68
## 5 DHAE -21 IL21 M 1043 None Naive 0 56.4 10.40
## 6 DHAE 7 IL21 M 1043 None SIV 107360 26.9 15.70
We next need to format some of the columns so that their data types are correct. Namely converting the groups to a factor and the viral loads to a numeric.
ms.data$Group <- factor(ms.data$Group, levels = c("Control", "IL15", "aIL15", "IL21"))
ms.data$VL <- as.numeric(ms.data$VL)
The final step of these initial analyses is to define the amino acids that we are interested in. For this analysis we will examine all 20 amino acids and their changes during acute and chronic SIV infection.
amino.acids <- c("Ala", "Arg", "Asn", "Asp",
"Cys", "Glu", "Gln", "Gly",
"His", "Ile", "Leu", "Lys",
"Met", "Phe", "Pro", "Ser",
"Thr", "Trp", "Tyr", "Val")
Finally, we will load all the R libraries used for this analysis set.
require(ggplot2)
require(dplyr)
require(rpart)
require(rpart.plot)
require(pheatmap)
require(caret)
The first analysis we will perform is a simple heatmap analysis to examine expression of the 20 amino acids during the three timepoints we are interested in: Pre-infection, Acute infection, and Chronic infection.
Since there is a sample that did not have Day -21 and Day 70 timepoints but did have Day 0 and Day 50, we will modify the timepoint labels to include that sample. We will first copy the raw data into a new data frame without the Day 91 timepoint since that is not of interest for this analysis.
hm.data <- ms.data[, c("Sample", "Timepoint", amino.acids)]
hm.data <- hm.data %>% filter(Timepoint != 91)
hm.data$Timepoint <- ifelse(hm.data$Timepoint==70 | hm.data$Timepoint == 50, "Chronic",
ifelse(hm.data$Timepoint == 7, "Acute", "Pre"))
# Set the Timepoint as a factor and order the levels correctly
hm.data$Timepoint <- factor(hm.data$Timepoint, levels = c("Pre", "Acute", "Chronic"))
Next, we need to modify the hm.data data frame into a matrix format for the pheatmap function.
# First let's make sure the timepoints are ordered correctly
hm.data <- with(hm.data, hm.data[order(Timepoint),])
# Next we need to transpose the data frame since pheatmap wants features in rows and samples in columns
hm.data <- as.data.frame(t(hm.data))
# Here we are using sapply to make sure all the columns are of a numeric class
mat <- as.matrix(sapply(hm.data[3:nrow(hm.data),], as.numeric))
# Set row name to the Amino Acid names
rownames(mat) <- rownames(hm.data)[3:22]
# convert our matrix to log2 for better visualization
mat <- log2(mat)
# Create annotation data frames that link the columns and rows to their correct identifiers
my_col <- as.data.frame(t(hm.data[2,,drop=F]))
# We also need to reset the factors for this
my_col$Timepoint <- factor(my_col$Timepoint, levels = c("Pre", "Acute", "Chronic"))
my_row <- as.data.frame(row.names(hm.data)[3:22])
Now that we have our data in the correct format we can simply pass these to the pheatmap function to visualize.
# Creating a custom color ramp because I don't like the default one in pheatmap
colors <- c(min(mat),seq(-2,2,by=0.01),max(mat))
my_palette <- c("green",colorRampPalette(colors = c("blue", "white", "red"))
(n = length(colors)-3), "red")
# We will cluster the rows because we are interested in clusters of amino acids potentially predicting viremia outcomes later on
# Gaps are used on both columns and rows for easier visualization of groups
pheatmap(mat,
annotation_col = my_col,
cluster_rows = T,
cluster_cols = F,
show_colnames = F,
scale = "row",
color = my_palette,
gaps_col = c(20,40),
cutree_rows = 4)
Our dataset is too small for statistically rigorous decision trees but as a proof-of-concept we will perform one anyway.
The first thing we need to do is create a new data frame of just the information we need. In order to do this we are going to take the Chronic timepoint and classify samples as either “Low” or “High” viremia based on if their viremia at chronic infection is less than or greater than the median, respectively.
# Summarize data to obtain mean, media, standard deviation, and standard error
summ.ms.data <- ms.data %>%
filter(Timepoint == 50 | Timepoint == 70) %>%
summarize(avg = mean(VL), n = n(),
std = sd(VL), se = std/sqrt(n),
median = median(VL))
# Set Viral Load Group (VLG) based on median VL
vlg.labels <- ifelse(ms.data[ms.data$Timepoint==50 | ms.data$Timepoint == 70, "VL"] > summ.ms.data$median, "High", "Low")
names(vlg.labels) <- ms.data[ms.data$Timepoint==50 | ms.data$Timepoint == 70, "Sample"]
# Need to set all timepoints to use these grouping labels
for(x in unique(ms.data$Sample)) {
ms.data[ms.data$Sample == x, "VLG"] <- vlg.labels[x]
}
ms.data <- ms.data[, c(1:8, 636, 9:635)]
for(x in colnames(ms.data)[10:636]) {
ms.data[, x] <- as.numeric(ms.data[, x])
}
Next, we will make a new data frame for these analyses and split the data into a testing and training data frame. We are interested in examining pre-infection amino acid profiles to determine if there is a specific amino acid profile that can be predictive of lower viremia.
# Take only the pre-infection timepoints since we are trying to find a predictive amino acid profile
data <- ms.data %>% filter(Timepoint == 0 | Timepoint == -21)
data <- data[, c("VLG", amino.acids)]
for(x in amino.acids) {
data[, x] <- as.numeric(data[, x])
}
# Split the data into training and testing
set.seed(20220228)
set.seed(20220301)
inTrain = createDataPartition(data$VLG, p = .6)[[1]]
train = data[ inTrain,]
test = data[-inTrain,]
table(train$VLG)
##
## High Low
## 6 6
table(test$VLG)
##
## High Low
## 4 4
Now, we can use rpart to generate a decision tree.
model <- rpart(VLG~.,
data = train,
method="class",
control=rpart.control(minsplit=2, minbucket=1, cp=0.001))
Now let’s plot the decision tree to see what, if any, are the predictive amino acid profiles.
rpart.plot(model, type = 4, extra = 101, tweak = 1, clip.right.labs = F)
Interpreting a decision tree is composed of answering a series of Yes/No questions. For the decision tree generated here we can begin at the top most node and work our ways down the nodes.
Now that we have a decision tree we can attempt to predict the classes of the testing data frame. We only have 6 samples total in this data frame so it is not robust enough for definitive conclusions but may be indicative of SIV protection.
# Predict the test data set
pred <- predict(model, test, type = "class")
table(test$VLG, pred)
## pred
## High Low
## High 3 1
## Low 1 3
# Examine the confusion matrix
confusionMatrix(table(pred, test$VLG))
## Confusion Matrix and Statistics
##
##
## pred High Low
## High 3 1
## Low 1 3
##
## Accuracy : 0.75
## 95% CI : (0.3491, 0.9681)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.1445
##
## Kappa : 0.5
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.750
## Specificity : 0.750
## Pos Pred Value : 0.750
## Neg Pred Value : 0.750
## Prevalence : 0.500
## Detection Rate : 0.375
## Detection Prevalence : 0.500
## Balanced Accuracy : 0.750
##
## 'Positive' Class : High
##
So we can see that with this small data set we achieve 75% accuracy. However, due to the small number of samples this may just be an over fitting problem and is not robust enough to predict real world outcomes.
One thing we are interested in is the kinetics of amino acid concentrations during the course of SIV infection. To visualize this we will simply take our data and melt it into a format useful for rapid plotting.
For this set of visualizations we will utilize a couple extra libraries.
require(reshape2)
require(ggrepel)
require(ggsignif)
require(rstatix)
require(ggpubr)
In the hidden code block below we define a function to plot the selected amino acid.
generate_aa_plot <- function(x, df) {
# Filter input data frame to just be selected variable
tmp <- df %>% filter(variable == x)
tmp2 <- tmp
tmp2$value <- tmp2$value
# Summarize the data, used in plotting the mean and standard deviation ribbons
tmp2 <- tmp2 %>% group_by(Timepoint) %>%
summarize(avg = mean(value), n = n(),
std = sd(value), se = std/sqrt(n)) %>%
mutate(lower.ci = avg - qt(1 - (0.05 / 2), n - 1) * se,
upper.ci= avg + qt(1 - (0.05 / 2), n - 1) * se)
# paired t test with Benjamini-Hochberg correction
stat.test <- tmp %>%
pairwise_t_test(value ~ Timepoint,
comparisons = list(c("-21", "7"),
c("7", "70"),
c("-21", "70")),
paired = T,
p.adjust.method = "BH") %>%
add_xy_position(x = "Timepoint")
stat.test$group1 <- as.numeric(stat.test$group1)
stat.test$group2 <- as.numeric(stat.test$group2)
p <-
ggplot(tmp, aes(x = Timepoint, y = value, group = 1)) +
geom_point(aes(group = Sample),alpha=0.1) +
geom_line(aes(group = Sample),alpha=0.1) +
geom_ribbon(data = tmp2,
aes(x = Timepoint,
y = avg,
ymin = avg - std,
ymax = avg + std),
fill = "steelblue2",
alpha=0.5) +
geom_line(data = tmp2,
aes(x = Timepoint, y = avg),
color = "firebrick",
size = 1,
linetype = "dashed") +
ylab(paste(x,"Concentration (\U03BCM)")) +
scale_x_continuous(breaks = c(-21, 7, 70), labels = c("Pre", "Acute", "Chronic"))
p<- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
p <- p + stat_pvalue_manual(data = stat.test, xmin = "group1", xmax = "group2")
print(p)
}
Let’s create a new data frame from the original data for these plots. The significance levels were calculated with the pairwise_t_test function from the rstatix package with the parameters of paired = TRUE and the Benjamini-Hochberg correction for multiple testing.
# We only want to look at the pre-cytokine treatment timepoints
ms.siv <- ms.data
# Let's filter out unneeded info so we can melt the df
ms.siv <- ms.siv[, c(1, 2, 3, 5, 6:632)]
temp.df <- ms.siv %>% filter(Timepoint != 91)
temp.df <- temp.df[, c("Sample", "Timepoint", "Group", amino.acids)]
temp.df$Timepoint <- ifelse(temp.df$Timepoint == -21 | temp.df$Timepoint == 0, -21,
ifelse(temp.df$Timepoint == 7, 7, 70))
temp.melt <- melt(temp.df, id.vars = c("Sample", "Timepoint", "Group"))
Now that we have a good format for the data we will plot every amino acid.
for(aa in amino.acids) {
generate_aa_plot(aa, temp.melt)
}